Attribution statement: 1. I did this homework by myself, with help from the book and the professor.
#reportSampleX.RData –
# import libraries
#create a function to ensure the libraries are imported
EnsurePackage <- function(x){
x <- as.character(x)
if (!require(x,character.only = TRUE)){
install.packages(pkgs=x, repos = "http://cran.us.r-project.org")
require(x, character.only = TRUE)
}
}
EnsurePackage("nlme")
## Loading required package: nlme
load("~/Documents/R/IST772/allSchoolsReportStatus.RData")
# data(allSchoolsReportStatus)
allSchools <- data.frame(allSchoolsReportStatus)
str(allSchools)
## 'data.frame': 7381 obs. of 3 variables:
## $ name : chr "AGUA DULCE ELEMENTARY" "MEADOWLARK ELEMENTARY" "CALIFORNIA SCHOOL FOR THE DEAF-FREMONT" "HIDDEN VALLEY ELEMENTARY" ...
## $ pubpriv : chr "PUBLIC" "PUBLIC" "PUBLIC" "PUBLIC" ...
## $ reported: chr "Y" "Y" "Y" "Y" ...
summary(allSchools)
## name pubpriv reported
## Length:7381 Length:7381 Length:7381
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
# View(allSchools)
dim(allSchools) # 7381 3
## [1] 7381 3
colnames(allSchools) # "name" "pubpriv" "reported"
## [1] "name" "pubpriv" "reported"
# table(allSchools$reported)
# table(allSchools$pubpriv)
# table(allSchools$isreported)
# table(allSchools$ispubpriv)
# define variables
# name <- allSchools$name
# pubpriv <- allSchools$pubpriv
# reported <- allSchools$reported
# allSchools$isreported <- as.numeric(as.factor(allSchools$reported))-1 # Fix the factor
# allSchools$ispubpriv <- as.numeric(as.factor(allSchools$pubpriv))-1 # Adjust the outcome
# head(allSchools)
allSchoolsDF <- data.frame(allSchools$name
,as.numeric(as.factor(allSchools$pubpriv))-1
,as.numeric(as.factor(allSchools$reported))-1,val=1,stringsAsFactors=FALSE)
colnames(allSchoolsDF) <- c("schoolName","isPublicPrivate","isReported","value")
str(allSchoolsDF)
## 'data.frame': 7381 obs. of 4 variables:
## $ schoolName : chr "AGUA DULCE ELEMENTARY" "MEADOWLARK ELEMENTARY" "CALIFORNIA SCHOOL FOR THE DEAF-FREMONT" "HIDDEN VALLEY ELEMENTARY" ...
## $ isPublicPrivate: num 1 1 1 1 1 1 1 1 1 1 ...
## $ isReported : num 1 1 1 1 1 1 1 1 1 1 ...
## $ value : num 1 1 1 1 1 1 1 1 1 1 ...
# head(allSchoolsDF)
# allSchoolsDF[which(allSchoolsDF$schoolName=="AGUA DULCE ELEMENTARY"),]
#create a dataset for Public Schools
publicSchools <- subset(allSchoolsDF,allSchoolsDF$isPublicPrivate==1)
nrow(publicSchools) #5732
## [1] 5732
#create a dataset for Public Schools
privateSchools <- subset(allSchoolsDF,allSchoolsDF$isPublicPrivate==0)
nrow(privateSchools) #1649
## [1] 1649
# PRIVATE = 0
# PUBLIC = 1
table(allSchools$pubpriv)
##
## PRIVATE PUBLIC
## 1649 5732
table(allSchoolsDF$isPublicPrivate)
##
## 0 1
## 1649 5732
table(allSchools$reported)
##
## N Y
## 400 6981
table(allSchoolsDF$isReported)
##
## 0 1
## 400 6981
# install.packages("dlookr") # note: requires version 3.5.2 or higher
EnsurePackage("dlookr")
## Loading required package: dlookr
## Loading required package: mice
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## Attaching package: 'dlookr'
## The following object is masked from 'package:base':
##
## transform
diagnose_outlier(allSchoolsDF)
plot_outlier(allSchoolsDF)
EnsurePackage("tidyverse")
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.1 ✓ dplyr 1.0.2
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::collapse() masks nlme::collapse()
## x dplyr::filter() masks mice::filter(), stats::filter()
## x dplyr::lag() masks stats::lag()
#data distribution
# violin plot with median points
allSchools %>%
pivot_longer(cols=-c(name), names_to="variable", values_to="value", values_drop_na = TRUE) %>%
ggplot(aes(x=variable, y=value)) + geom_violin(trim=FALSE, fill='steelblue', color="orange") +
facet_wrap( ~ variable, scales="free") +
stat_summary(fun=mean, geom="point", shape=23, size=4,color="red") +
theme_minimal()
# Summary table
agg <- count(allSchoolsDF, isPublicPrivate, isReported)
totSchools <- sum(agg$n)
ratio <- round(agg$n/totSchools,2)
aggDF <- data.frame(agg,ratio)
aggDF
EnsurePackage("gridExtra")
## Loading required package: gridExtra
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
ecols <- c("steelblue","orange")
agg_ord <- mutate(agg,
isReported = reorder(isReported, -n, sum),
isPublicPrivate = reorder(isPublicPrivate, -n, sum))
p1 <- ggplot(agg_ord) +
geom_col(aes(x = isPublicPrivate, y = n, fill = isReported)) +
scale_fill_manual(values = ecols)
p2 <- ggplot(agg_ord) +
geom_col(aes(x = isPublicPrivate, y = n, fill = isReported), position = "dodge") +
scale_fill_manual(values = ecols)
grid.arrange(p1, p2, nrow = 1)
barplot(table(allSchoolsDF$isReported),
main="All Schools reported",
xlab="Reported",
ylab="Schools",
border="orange",
col="steelblue",
density=10
)
barplot(table(allSchools$pubpriv),
main="Public/Private Schools",
xlab="School",
ylab="Number of Schools",
border="orange",
col="steelblue",
density=10
)
# total schools split
table(allSchools$pubpriv)
##
## PRIVATE PUBLIC
## 1649 5732
# 5732/7381 ( ~78%)
# PRIVATE = 0
# PUBLIC = 1
# Public Schools table
aggPublic <- count(publicSchools, isPublicPrivate, isReported)
# aggPublic
totPublicSchools <- sum(aggPublic$n)
ratioPublic <- round(aggPublic$n/totPublicSchools,2)
aggPublicDF <- data.frame(aggPublic,ratioPublic)
aggPublicDF
barplot(table(publicSchools$isReported),
main="Public Schools Reported",
xlab="Reported",
ylab="Number of Schools",
border="orange",
col="steelblue",
density=10
)
# total schools split
table(allSchools$pubpriv)
##
## PRIVATE PUBLIC
## 1649 5732
# PRIVATE = 0
# PUBLIC = 1
# 1649/7381 ( ~22%)
# Private Schools table
aggPrivate <- count(privateSchools, isPublicPrivate, isReported)
# aggPrivate
totPrivateSchools <- sum(aggPrivate$n)
ratioPrivate <- round(aggPrivate$n/totPrivateSchools,2)
aggPrivateDF <- data.frame(aggPrivate,ratioPrivate)
aggPrivateDF
barplot(table(privateSchools$isReported),
main="Private Schools Reported",
xlab="Reported",
ylab="Schools",
border="orange",
col="steelblue",
density=10
)
Once, we removed the null data, started analyzing with more balanced data, DTP1 HepB_BD Pol3 Hib3 MCV1
Min. :97.00 Min. :21.00 Min. :90.00 Min. :84.00 Min. :90.00
1st Qu.:98.00 1st Qu.:50.00 1st Qu.:92.75 1st Qu.:93.00 1st Qu.:91.75
Median :98.00 Median :62.00 Median :93.00 Median :93.00 Median :92.00
Mean :98.19 Mean :57.94 Mean :92.88 Mean :92.25 Mean :91.75
3rd Qu.:99.00 3rd Qu.:69.75 3rd Qu.:94.00 3rd Qu.:93.25 3rd Qu.:92.00
Max. :99.00 Max. :74.00 Max. :94.00 Max. :94.00 Max. :93.00
# Import data
load("~/Documents/R/IST772/usVaccines.RData")
str(usVaccines)
## Time-Series [1:38, 1:5] from 1980 to 2017: NA NA NA NA NA NA NA NA NA NA ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:5] "DTP1" "HepB_BD" "Pol3" "Hib3" ...
usVaccinesTS <- usVaccines
ts.plot(usVaccinesTS)
# View(usVaccinesTS)
EnsurePackage("mice") # missing data
EnsurePackage("visdat") # missing data
## Loading required package: visdat
usVaccinesDF <- data.frame(usVaccinesTS)
str(usVaccinesDF)
## 'data.frame': 38 obs. of 5 variables:
## $ DTP1 : int NA NA NA NA NA NA NA NA NA NA ...
## $ HepB_BD: int NA NA NA NA NA NA NA NA NA NA ...
## $ Pol3 : int 95 96 97 97 97 96 97 24 97 58 ...
## $ Hib3 : int NA NA NA NA NA NA NA NA NA NA ...
## $ MCV1 : int 86 97 97 98 98 97 97 82 98 87 ...
# View(usVaccinesDF)
# missing data
md.pattern(usVaccinesDF, plot=FALSE)
## Pol3 MCV1 Hib3 DTP1 HepB_BD
## 16 1 1 1 1 1 0
## 2 1 1 1 1 0 1
## 5 1 1 1 0 0 2
## 15 1 1 0 0 0 3
## 0 0 15 20 22 57
# missing data visualization
vis_miss(usVaccinesDF)
EnsurePackage("changepoint")
## Loading required package: changepoint
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Successfully loaded changepoint package version 2.2.2
## NOTE: Predefined penalty values changed in version 2.2. Previous penalty values with a postfix 1 i.e. SIC1 are now without i.e. SIC and previous penalties without a postfix i.e. SIC are now with a postfix 0 i.e. SIC0. See NEWS and help files for further details.
EnsurePackage("tseries")
## Loading required package: tseries
EnsurePackage("timeSeries")
## Loading required package: timeSeries
## Loading required package: timeDate
##
## Attaching package: 'timeDate'
## The following objects are masked from 'package:dlookr':
##
## kurtosis, skewness
##
## Attaching package: 'timeSeries'
## The following object is masked from 'package:zoo':
##
## time<-
class(usVaccinesTS)
## [1] "mts" "ts" "matrix"
usVaccinesTSnew <- removeNA(usVaccinesTS)
# usVaccinesTSnew
# start(usVaccinesTSnew)
# end(usVaccinesTSnew)
# frequency(usVaccinesTSnew)
summary(usVaccinesTSnew)
## DTP1 HepB_BD Pol3 Hib3
## Min. :97.00 Min. :21.00 Min. :90.00 Min. :84.00
## 1st Qu.:98.00 1st Qu.:50.00 1st Qu.:92.75 1st Qu.:93.00
## Median :98.00 Median :62.00 Median :93.00 Median :93.00
## Mean :98.19 Mean :57.94 Mean :92.88 Mean :92.25
## 3rd Qu.:99.00 3rd Qu.:69.75 3rd Qu.:94.00 3rd Qu.:93.25
## Max. :99.00 Max. :74.00 Max. :94.00 Max. :94.00
## MCV1
## Min. :90.00
## 1st Qu.:91.75
## Median :92.00
## Mean :91.75
## 3rd Qu.:92.00
## Max. :93.00
plot.ts(usVaccinesTSnew)
dusVaccines <- diff(usVaccinesTSnew)
# dusVaccines
dusMI <- dusVaccines[,2]
dSMIcp <- cpt.var(dusMI)
summary(dSMIcp)
## Created Using changepoint version 2.2.2
## Changepoint type : Change in variance
## Method of analysis : AMOC
## Test Statistic : Normal
## Type of penalty : MBIC with value, 8.124151
## Minimum Segment Length : 2
## Maximum no. of cpts : 1
## Changepoint Locations :
plot(dSMIcp,cpt.col="red",cpt.width=5)
ts.plot(usVaccinesTSnew)
plot(usVaccinesTSnew,col="blue",main="VaccinesTSnew undifferential plot")
plot(dusVaccines,col="orange",main="VaccinesTSnew differential plot")
EnsurePackage("bcp")
## Loading required package: bcp
## Loading required package: grid
# Bayesian Change point analysis
bAPcpm <- bcp(as.vector(usVaccinesTSnew))
plot(bAPcpm)
# summary(bAPcpm)
# reportSample
load("~/Documents/R/IST772/reportSample12.RData")
str(reportSample)
## 'data.frame': 698 obs. of 13 variables:
## $ code : num 6019103 6975866 6116404 6120620 6004691 ...
## $ name : Factor w/ 5890 levels "A. E. ARNOLD ELEMENTARY",..: 4578 4991 426 5423 1370 5344 5026 3075 3591 4249 ...
## $ pubpriv : Factor w/ 2 levels "PRIVATE","PUBLIC": 2 1 2 2 2 2 1 2 2 2 ...
## $ enrollment : num 32 22 137 78 127 83 27 33 18 28 ...
## $ allvaccs : num 90.6 100 93.4 96.2 99.2 ...
## $ conditional: num 9.38 0 1.46 0 0 ...
## $ medical : num 0 0 0 0 0 0 0 0 0 0 ...
## $ religious : num 0 0 5.109 3.846 0.787 ...
## $ dptMiss : num 3.125 0 5.839 3.846 0.787 ...
## $ polMiss : num 3.125 0 5.839 3.846 0.787 ...
## $ mmrMiss : num 9.375 0 6.569 3.846 0.787 ...
## $ hepMiss : num 0 0 5.109 3.846 0.787 ...
## $ varMiss : num 0 0 5.109 3.846 0.787 ...
# head(reportSample)
# View(reportSample)
dim(reportSample)
## [1] 698 13
# summary(reportSample)
EnsurePackage("tidyverse")
#data distribution
# violin plot with median points
reportSample %>%
pivot_longer(cols=-c(code,name,pubpriv), names_to="variable", values_to="value", values_drop_na = TRUE) %>%
ggplot(aes(x=variable, y=value)) + geom_violin(trim=FALSE, fill='steelblue', color="orange") +
facet_wrap( ~ variable, scales="free") +
stat_summary(fun=mean, geom="point", shape=23, size=4,color="red") +
theme_minimal()
EnsurePackage("dlookr") # outlier analysis
# # #outlier analysis
plot_outlier(reportSample)
str(allSchools)
## 'data.frame': 7381 obs. of 3 variables:
## $ name : chr "AGUA DULCE ELEMENTARY" "MEADOWLARK ELEMENTARY" "CALIFORNIA SCHOOL FOR THE DEAF-FREMONT" "HIDDEN VALLEY ELEMENTARY" ...
## $ pubpriv : chr "PUBLIC" "PUBLIC" "PUBLIC" "PUBLIC" ...
## $ reported: chr "Y" "Y" "Y" "Y" ...
str(allSchoolsDF)
## 'data.frame': 7381 obs. of 4 variables:
## $ schoolName : chr "AGUA DULCE ELEMENTARY" "MEADOWLARK ELEMENTARY" "CALIFORNIA SCHOOL FOR THE DEAF-FREMONT" "HIDDEN VALLEY ELEMENTARY" ...
## $ isPublicPrivate: num 1 1 1 1 1 1 1 1 1 1 ...
## $ isReported : num 1 1 1 1 1 1 1 1 1 1 ...
## $ value : num 1 1 1 1 1 1 1 1 1 1 ...
str(reportSample)
## 'data.frame': 698 obs. of 13 variables:
## $ code : num 6019103 6975866 6116404 6120620 6004691 ...
## $ name : Factor w/ 5890 levels "A. E. ARNOLD ELEMENTARY",..: 4578 4991 426 5423 1370 5344 5026 3075 3591 4249 ...
## $ pubpriv : Factor w/ 2 levels "PRIVATE","PUBLIC": 2 1 2 2 2 2 1 2 2 2 ...
## $ enrollment : num 32 22 137 78 127 83 27 33 18 28 ...
## $ allvaccs : num 90.6 100 93.4 96.2 99.2 ...
## $ conditional: num 9.38 0 1.46 0 0 ...
## $ medical : num 0 0 0 0 0 0 0 0 0 0 ...
## $ religious : num 0 0 5.109 3.846 0.787 ...
## $ dptMiss : num 3.125 0 5.839 3.846 0.787 ...
## $ polMiss : num 3.125 0 5.839 3.846 0.787 ...
## $ mmrMiss : num 9.375 0 6.569 3.846 0.787 ...
## $ hepMiss : num 0 0 5.109 3.846 0.787 ...
## $ varMiss : num 0 0 5.109 3.846 0.787 ...
# Pre-Processing
# 1. Add new column "isPublicPrivate" with boolean value as 0 or 1
reportSampleDF <- data.frame(reportSample
,as.numeric(as.factor(reportSample$pubpriv))-1
,val=1,stringsAsFactors=FALSE)
reportSampleDF$pubpriv=as.character(reportSampleDF$pubpriv)
reportSampleDF$name=as.character(reportSampleDF$name)
names(reportSampleDF)[14] <- "isPublicPrivate"
str(reportSampleDF)
## 'data.frame': 698 obs. of 15 variables:
## $ code : num 6019103 6975866 6116404 6120620 6004691 ...
## $ name : chr "SELMA AVENUE ELEMENTARY" "ST. PIUS X" "BENJAMIN FRANKLIN ELEMENTARY" "UNIVERSITY PREPARATION SCHOOL AT CSU CHANNEL ISLANDS" ...
## $ pubpriv : chr "PUBLIC" "PRIVATE" "PUBLIC" "PUBLIC" ...
## $ enrollment : num 32 22 137 78 127 83 27 33 18 28 ...
## $ allvaccs : num 90.6 100 93.4 96.2 99.2 ...
## $ conditional : num 9.38 0 1.46 0 0 ...
## $ medical : num 0 0 0 0 0 0 0 0 0 0 ...
## $ religious : num 0 0 5.109 3.846 0.787 ...
## $ dptMiss : num 3.125 0 5.839 3.846 0.787 ...
## $ polMiss : num 3.125 0 5.839 3.846 0.787 ...
## $ mmrMiss : num 9.375 0 6.569 3.846 0.787 ...
## $ hepMiss : num 0 0 5.109 3.846 0.787 ...
## $ varMiss : num 0 0 5.109 3.846 0.787 ...
## $ isPublicPrivate: num 1 0 1 1 1 1 0 1 1 1 ...
## $ val : num 1 1 1 1 1 1 1 1 1 1 ...
# table(reportSampleDF$pubpriv)
barplot(table(reportSampleDF$pubpriv),
main="Public/Private Schools reported",
xlab="School",
ylab="Number of Schools",
border="orange",
col="steelblue",
density=10
)
# Public Schools table
aggSamples <- count(reportSampleDF,isPublicPrivate)
totSamples <- sum(aggSamples$n)
ratioPublic <- round(aggSamples$n/totSamples,2)
aggSamplesDF <- data.frame(aggSamples,ratioPublic)
aggSamplesDF
#box plot to compare the tensions
boxplot(allvaccs~pubpriv, data=reportSampleDF,
border="orange",
col="steelblue",
freq=FALSE,
xlab="",
las=1,
breaks=10,
# notch = TRUE,
horizontal = FALSE ,main=" vaccination rates (allvaccs) between public and private schools")
#box plot to compare the tensions
boxplot(allvaccs~pubpriv, data=reportSampleDF,
border="orange",
col="steelblue",
freq=FALSE,
xlab="",
las=1,
breaks=10,
# notch = TRUE,
horizontal = FALSE ,main=" vaccination rates (allvaccs) between public and private schools")
EnsurePackage("zoo")
reportSampleDF$allvaccs <- na.aggregate(reportSampleDF$allvaccs)
sum(is.na(reportSampleDF$allvaccs))
## [1] 0
# Confidence Interval Test
t.test(allvaccs~pubpriv, data=reportSampleDF)
##
## Welch Two Sample t-test
##
## data: allvaccs by pubpriv
## t = -3.7023, df = 179.7, p-value = 0.000284
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -8.939025 -2.723307
## sample estimates:
## mean in group PRIVATE mean in group PUBLIC
## 84.45064 90.28181
# Bayesian distribution
EnsurePackage("BEST")
## Loading required package: BEST
## Loading required package: HDInterval
bayesAllvaccs <- BESTmcmc(reportSampleDF$allvaccs[reportSampleDF$pubpriv=="PUBLIC"]
,reportSampleDF$allvaccs[reportSampleDF$pubpriv=="PRIVATE"])
## Waiting for parallel processing to complete...
## done.
bayesAllvaccs
diffBest <- bayesAllvaccs$mu1 - bayesAllvaccs$mu2
# summary(diffBest)
quantile(diffBest, c(0.025, 0.5, 0.975))
## 2.5% 50% 97.5%
## 0.8043082 3.2188500 5.8351416
#Bayesian probability distribution of the differences between two means
plot(bayesAllvaccs)
abline(v=quantile(diffBest,0.025),col="blue") # low end HDI differences in proportion
abline(v=quantile(diffBest,0.975),col="green") # high end HDI differences in proportion
abline(v=quantile(diffBest,0.50),col="red") # median in proportion
plotAll(bayesAllvaccs)
#box plot to compare the tensions
boxplot(medical~pubpriv, data=reportSampleDF,
border="orange",
col="steelblue",
freq=FALSE,
xlab="",
las=1,
# breaks=10,
# notch = TRUE,
horizontal = FALSE ,main=" medical excemption between public and private schools")
EnsurePackage("zoo")
reportSampleDF$medical <- na.aggregate(reportSampleDF$medical)
sum(is.na(reportSampleDF$medical))
## [1] 0
# Confidence Interval Test
t.test(medical~pubpriv, data=reportSampleDF)
##
## Welch Two Sample t-test
##
## data: medical by pubpriv
## t = 1.3321, df = 159.92, p-value = 0.1847
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.0870064 0.4475860
## sample estimates:
## mean in group PRIVATE mean in group PUBLIC
## 0.3169112 0.1366214
# Bayesian distribution
EnsurePackage("BEST")
bayesmedical <- BESTmcmc(reportSampleDF$medical[reportSampleDF$pubpriv=="PUBLIC"]
,reportSampleDF$medical[reportSampleDF$pubpriv=="PRIVATE"])
## Waiting for parallel processing to complete...done.
bayesmedical
bayesmedicalDiffBest <- bayesmedical$mu1 - bayesmedical$mu2
# summary(diffBest)
quantile(bayesmedicalDiffBest, c(0.025, 0.5, 0.975))
## 2.5% 50% 97.5%
## -1.190542e-04 1.793326e-07 1.185549e-04
#Bayesian probability distribution of the differences between two means
plot(bayesmedical)
abline(v=quantile(bayesmedicalDiffBest,0.025),col="blue") # low end HDI differences in proportion
abline(v=quantile(bayesmedicalDiffBest,0.975),col="green") # high end HDI differences in proportion
abline(v=quantile(bayesmedicalDiffBest,0.50),col="red") # median in proportion
plotAll(bayesmedical)
#box plot to compare the tensions
boxplot(religious~pubpriv, data=reportSampleDF,
border="orange",
col="steelblue",
freq=FALSE,
xlab="",
las=1,
breaks=10,
# notch = TRUE,
horizontal = FALSE ,main=" religious belief between public and private schools")
EnsurePackage("zoo")
reportSampleDF$religious <- na.aggregate(reportSampleDF$religious)
sum(is.na(reportSampleDF$religious))
## [1] 0
# Confidence Interval Test
t.test(religious~pubpriv, data=reportSampleDF)
##
## Welch Two Sample t-test
##
## data: religious by pubpriv
## t = 1.9052, df = 178.55, p-value = 0.05836
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.09940198 5.66035299
## sample estimates:
## mean in group PRIVATE mean in group PUBLIC
## 6.623709 3.843233
# Bayesian distribution
EnsurePackage("BEST")
bayesreligious <- BESTmcmc(reportSampleDF$religious[reportSampleDF$pubpriv=="PUBLIC"]
,reportSampleDF$religious[reportSampleDF$pubpriv=="PRIVATE"])
## Waiting for parallel processing to complete...done.
bayesreligious
bayesreligiousDiffBest <- bayesreligious$mu1 - bayesreligious$mu2
# summary(diffBest)
quantile(bayesreligiousDiffBest, c(0.025, 0.5, 0.975))
## 2.5% 50% 97.5%
## 0.4322757 0.5843983 0.7481946
#Bayesian probability distribution of the differences between two means
plot(bayesreligious)
abline(v=quantile(bayesreligiousDiffBest,0.025),col="blue") # low end HDI differences in proportion
abline(v=quantile(bayesreligiousDiffBest,0.975),col="green") # high end HDI differences in proportion
abline(v=quantile(bayesreligiousDiffBest,0.50),col="red") # median in proportion
plotAll(bayesreligious)
summary(reportSampleDF)
## code name pubpriv enrollment
## Min. : 100362 Length:698 Length:698 Min. : 10.00
## 1st Qu.:6015501 Class :character Class :character 1st Qu.: 42.25
## Median :6043137 Mode :character Mode :character Median : 75.00
## Mean :5588410 Mean : 76.11
## 3rd Qu.:6116305 3rd Qu.:105.00
## Max. :7103161 Max. :221.00
##
## allvaccs conditional medical religious
## Min. : 8.00 Min. : 0.000 Min. : 0.0000 Min. : 0.000
## 1st Qu.: 86.17 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 0.000
## Median : 93.33 Median : 3.030 Median : 0.0000 Median : 1.087
## Mean : 89.01 Mean : 6.931 Mean : 0.1759 Mean : 4.449
## 3rd Qu.: 97.46 3rd Qu.: 8.677 3rd Qu.: 0.0000 3rd Qu.: 4.353
## Max. :100.00 Max. :81.818 Max. :14.2857 Max. :162.500
##
## dptMiss polMiss mmrMiss hepMiss
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 1.699 1st Qu.: 1.493 1st Qu.: 1.429 1st Qu.: 0.722
## Median : 4.918 Median : 4.587 Median : 5.000 Median : 3.073
## Mean : 8.638 Mean : 8.229 Mean : 8.793 Mean : 5.909
## 3rd Qu.:10.960 3rd Qu.:10.526 3rd Qu.:11.111 3rd Qu.: 7.767
## Max. :81.250 Max. :81.250 Max. :81.250 Max. :79.070
## NA's :1 NA's :1
## varMiss isPublicPrivate val
## Min. : 0.000 Min. :0.0000 Min. :1
## 1st Qu.: 0.000 1st Qu.:1.0000 1st Qu.:1
## Median : 2.485 Median :1.0000 Median :1
## Mean : 5.371 Mean :0.7822 Mean :1
## 3rd Qu.: 6.667 3rd Qu.:1.0000 3rd Qu.:1
## Max. :79.070 Max. :1.0000 Max. :1
##
First, general linear model is built using the formula below with link function as “binomial” invoking inverse logit/logististic regression glm(isPublicPrivate ~ conditional + medical + religious ,data=reportSampleDF,family = binomial(link = “logit”))
# dim(reportSampleDF)
# Create glm model
options(scipen=999) # turn-off scientific notation like 1e+48
school.glm <- glm(isPublicPrivate ~ conditional + medical + religious ,data=reportSampleDF,family = binomial(link = "logit"))
summary(school.glm)
##
## Call:
## glm(formula = isPublicPrivate ~ conditional + medical + religious,
## family = binomial(link = "logit"), data = reportSampleDF)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8860 0.6083 0.6360 0.6857 1.4426
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.593445 0.122455 13.012 < 0.0000000000000002 ***
## conditional -0.025150 0.007917 -3.177 0.00149 **
## medical -0.188283 0.091933 -2.048 0.04056 *
## religious -0.017045 0.006992 -2.438 0.01477 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 731.59 on 697 degrees of freedom
## Residual deviance: 712.46 on 694 degrees of freedom
## AIC: 720.46
##
## Number of Fisher Scoring iterations: 4
hist(residuals(school.glm))
# Create lm model
options(scipen=999) # turn-off scientific notation like 1e+48
school.lm <- lm(isPublicPrivate ~ conditional + medical + religious ,data=reportSampleDF)
summary(school.lm)
##
## Call:
## lm(formula = isPublicPrivate ~ conditional + medical + religious,
## data = reportSampleDF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8385 0.1615 0.1809 0.2143 0.6133
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.838508 0.019762 42.431 < 0.0000000000000002 ***
## conditional -0.004863 0.001479 -3.289 0.00106 **
## medical -0.039131 0.017109 -2.287 0.02249 *
## religious -0.003525 0.001306 -2.700 0.00711 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4074 on 694 degrees of freedom
## Multiple R-squared: 0.03138, Adjusted R-squared: 0.02719
## F-statistic: 7.494 on 3 and 694 DF, p-value: 0.00006093
EnsurePackage("VIF")
## Loading required package: VIF
EnsurePackage("car")
## Loading required package: car
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:VIF':
##
## vif
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
vif(school.lm)
## conditional medical religious
## 1.001352 1.000499 1.000863
vif(school.glm)
## conditional medical religious
## 1.004026 1.001795 1.002371
EnsurePackage("MCMCpack")
## Loading required package: MCMCpack
## Loading required package: coda
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## ##
## ## Markov Chain Monte Carlo Package (MCMCpack)
## ## Copyright (C) 2003-2020 Andrew D. Martin, Kevin M. Quinn, and Jong Hee Park
## ##
## ## Support provided by the U.S. National Science Foundation
## ## (Grants SES-0350646 and SES-0350613)
## ##
#histogram on residuals chile.glm
hist(residuals(school.glm))
#Chi-Square analysis on logistic regression
anova(school.glm, test="Chisq")
# Convert the log odds for the coefficient on the predictor into regular odds
exp(coef(school.glm))
## (Intercept) conditional medical religious
## 4.9206691 0.9751638 0.8283803 0.9830997
#confusion matrix
table(round(predict(school.glm,type="response")),reportSampleDF$isPublicPrivate)
##
## 0 1
## 0 6 4
## 1 146 542
EnsurePackage("MASS") # AIC
stepOut <- stepAIC(school.glm)
## Start: AIC=720.46
## isPublicPrivate ~ conditional + medical + religious
##
## Df Deviance AIC
## <none> 712.46 720.46
## - medical 1 716.82 722.82
## - religious 1 718.55 724.55
## - conditional 1 722.10 728.10
stepOut$anova
#bayesian estimation of logistic regression
options(scipen=999) # turn-off scientific notation like 1e+48
EnsurePackage("BayesFactor")
## Loading required package: BayesFactor
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## ************
## Welcome to BayesFactor 0.9.12-4.2. If you have questions, please contact Richard Morey (richarddmorey@gmail.com).
##
## Type BFManual() to open the manual.
## ************
schoolMCMCout <- lmBF(isPublicPrivate ~ conditional + medical + religious ,data=reportSampleDF, posterior=TRUE, iterations=10000)
summary(schoolMCMCout)
##
## Iterations = 1:10000
## Thinning interval = 1
## Number of chains = 1
## Sample size per chain = 10000
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## mu 0.782291 0.015447 0.00015447 0.00015447
## conditional -0.004708 0.001450 0.00001450 0.00001450
## medical -0.037980 0.016760 0.00016760 0.00016760
## religious -0.003414 0.001297 0.00001297 0.00001320
## sig2 0.165941 0.008886 0.00008886 0.00009047
## g 0.077273 0.116194 0.00116194 0.00116194
##
## 2. Quantiles for each variable:
##
## 2.5% 25% 50% 75% 97.5%
## mu 0.752232 0.771905 0.782344 0.792765 0.8128309
## conditional -0.007581 -0.005678 -0.004701 -0.003738 -0.0018760
## medical -0.070183 -0.049255 -0.038062 -0.026418 -0.0050365
## religious -0.005981 -0.004269 -0.003425 -0.002545 -0.0008822
## sig2 0.149463 0.159814 0.165602 0.171783 0.1842068
## g 0.014333 0.029616 0.047666 0.083318 0.3227107
# schoolMCMCout
rsqList <- 1 - (schoolMCMCout[,"sig2"] / var(reportSampleDF$isPublicPrivate))
MeanrsqList <- mean(rsqList)
MeanrsqList
## [1] 0.02724264
quantile(rsqList,c(0.025))
## 2.5%
## -0.07983559
quantile(rsqList,c(0.975))
## 97.5%
## 0.1238336
# Bayes factor analysis
schoolMCMCoutBF <- lmBF(isPublicPrivate ~ conditional + medical + religious ,data=reportSampleDF)
summary(schoolMCMCoutBF)
## Bayes factor analysis
## --------------
## [1] conditional + medical + religious : 80.08021 ±0.01%
##
## Against denominator:
## Intercept only
## ---
## Bayes factor type: BFlinearModel, JZS
# histogram of 95% HDI mean differences on conditional | overlaps with 0
hist(schoolMCMCout[,"conditional"],col="#CBB43D")
abline(v=quantile(schoolMCMCout[,"conditional"],c(0.025)),col="blue")
abline(v=quantile(schoolMCMCout[,"conditional"],c(0.975)),col="green")
# histogram of 95% HDI mean differences on medical | overlaps with 0
hist(schoolMCMCout[,"medical"],col="#4DAFD4")
abline(v=quantile(schoolMCMCout[,"medical"],c(0.025)),col="blue")
abline(v=quantile(schoolMCMCout[,"medical"],c(0.975)),col="green")
# histogram of 95% HDI mean differences on religious | overlaps with 0
hist(schoolMCMCout[,"religious"],col="pink")
abline(v=quantile(schoolMCMCout[,"religious"],c(0.025)),col="blue")
abline(v=quantile(schoolMCMCout[,"religious"],c(0.975)),col="green")
Having strong F score and p-value < alpha level ; We can reject the NULL hypothesis.
VIF() - to test for multi colleniarity; For the given predictor (conditional), multicollinearity can assessed by computing a score called the variance inflation factor (or VIF). dptMiss and polMiss clearly suggesting signs of multi colleniarity. So, its recommened to ignore these values.However, based on the correlatio coeffient values, if we reject polMiss then there could be better output. dptMiss polMiss mmrMiss varMiss 14.262805 16.377165 7.110362 3.513100
# Pre-Processing
EnsurePackage("mice") # missing data
EnsurePackage("visdat") # missing data
EnsurePackage("zoo")
summary(reportSampleDF)
## code name pubpriv enrollment
## Min. : 100362 Length:698 Length:698 Min. : 10.00
## 1st Qu.:6015501 Class :character Class :character 1st Qu.: 42.25
## Median :6043137 Mode :character Mode :character Median : 75.00
## Mean :5588410 Mean : 76.11
## 3rd Qu.:6116305 3rd Qu.:105.00
## Max. :7103161 Max. :221.00
##
## allvaccs conditional medical religious
## Min. : 8.00 Min. : 0.000 Min. : 0.0000 Min. : 0.000
## 1st Qu.: 86.17 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 0.000
## Median : 93.33 Median : 3.030 Median : 0.0000 Median : 1.087
## Mean : 89.01 Mean : 6.931 Mean : 0.1759 Mean : 4.449
## 3rd Qu.: 97.46 3rd Qu.: 8.677 3rd Qu.: 0.0000 3rd Qu.: 4.353
## Max. :100.00 Max. :81.818 Max. :14.2857 Max. :162.500
##
## dptMiss polMiss mmrMiss hepMiss
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 1.699 1st Qu.: 1.493 1st Qu.: 1.429 1st Qu.: 0.722
## Median : 4.918 Median : 4.587 Median : 5.000 Median : 3.073
## Mean : 8.638 Mean : 8.229 Mean : 8.793 Mean : 5.909
## 3rd Qu.:10.960 3rd Qu.:10.526 3rd Qu.:11.111 3rd Qu.: 7.767
## Max. :81.250 Max. :81.250 Max. :81.250 Max. :79.070
## NA's :1 NA's :1
## varMiss isPublicPrivate val
## Min. : 0.000 Min. :0.0000 Min. :1
## 1st Qu.: 0.000 1st Qu.:1.0000 1st Qu.:1
## Median : 2.485 Median :1.0000 Median :1
## Mean : 5.371 Mean :0.7822 Mean :1
## 3rd Qu.: 6.667 3rd Qu.:1.0000 3rd Qu.:1
## Max. :79.070 Max. :1.0000 Max. :1
##
# missing data
md.pattern(reportSampleDF, plot=FALSE)
## code name pubpriv enrollment allvaccs conditional medical religious dptMiss
## 696 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0 0
## hepMiss varMiss isPublicPrivate val polMiss mmrMiss
## 696 1 1 1 1 1 1 0
## 1 1 1 1 1 1 0 1
## 1 1 1 1 1 0 1 1
## 0 0 0 0 1 1 2
# missing data visualization
vis_miss(reportSampleDF)
#replace missing values
reportSampleDF$polMiss <- na.aggregate(reportSampleDF$polMiss)
sum(is.na(reportSampleDF$polMiss))
## [1] 0
reportSampleDF$mmrMiss <- na.aggregate(reportSampleDF$mmrMiss)
sum(is.na(reportSampleDF$mmrMiss))
## [1] 0
options(scipen=999) # turn-off scientific notation like 1e+48
# summary(reportSampleDF)
# str(reportSampleDF)
# Crete lm model
conditional.lm <- lm(conditional ~ dptMiss + polMiss + mmrMiss + varMiss ,data=reportSampleDF)
summary(conditional.lm)
##
## Call:
## lm(formula = conditional ~ dptMiss + polMiss + mmrMiss + varMiss,
## data = reportSampleDF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.797 -1.952 -1.042 0.553 42.043
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.04211 0.27139 3.84 0.000134 ***
## dptMiss 0.65200 0.07326 8.90 < 0.0000000000000002 ***
## polMiss 0.08391 0.07992 1.05 0.294117
## mmrMiss 0.71589 0.04923 14.54 < 0.0000000000000002 ***
## varMiss -1.25271 0.04485 -27.93 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.565 on 693 degrees of freedom
## Multiple R-squared: 0.7176, Adjusted R-squared: 0.716
## F-statistic: 440.2 on 4 and 693 DF, p-value: < 0.00000000000000022
options(scipen=999) # turn-off scientific notation like 1e+48
# summary(reportSampleDF)
# str(reportSampleDF)
# Crete glm model
# conditional.glm <- glm(conditional ~ dptMiss + polMiss + mmrMiss + varMiss ,data=reportSampleDF,family = binomial(link = "logit"))
## Error in eval(family$initialize) : y values must be 0 <= y <= 1
EnsurePackage("VIF")
EnsurePackage("car")
vif(conditional.lm)
## dptMiss polMiss mmrMiss varMiss
## 14.262805 16.377165 7.110362 3.513100
# vif(conditional.glm)
EnsurePackage("MCMCpack")
#histogram on residuals chile.glm
hist(residuals(conditional.lm))
#Chi-Square analysis on logistic regression
anova(conditional.lm, test="Chisq")
# Convert the log odds for the coefficient on the predictor into regular odds
exp(coef(conditional.lm))
## (Intercept) dptMiss polMiss mmrMiss varMiss
## 2.835203 1.919370 1.087528 2.046004 0.285728
EnsurePackage("MASS") # AIC
stepOut_conditional <- stepAIC(conditional.lm)
## Start: AIC=2401.29
## conditional ~ dptMiss + polMiss + mmrMiss + varMiss
##
## Df Sum of Sq RSS AIC
## - polMiss 1 34.1 21498 2400.4
## <none> 21464 2401.3
## - dptMiss 1 2453.3 23917 2474.8
## - mmrMiss 1 6549.0 28013 2585.2
## - varMiss 1 24161.2 45625 2925.6
##
## Step: AIC=2400.4
## conditional ~ dptMiss + mmrMiss + varMiss
##
## Df Sum of Sq RSS AIC
## <none> 21498 2400.4
## - dptMiss 1 6058.6 27557 2571.7
## - mmrMiss 1 7955.0 29453 2618.2
## - varMiss 1 24614.9 46113 2931.1
stepOut_conditional$anova
# Bayesian regression with Posterior iterations
options(scipen=999) # turn-off scientific notation like 1e+48
EnsurePackage("BayesFactor")
MCMCout_conditional <- lmBF(conditional ~ dptMiss + polMiss + mmrMiss + varMiss ,data=reportSampleDF, posterior=TRUE, iterations=10000)
summary(MCMCout_conditional)
##
## Iterations = 1:10000
## Thinning interval = 1
## Number of chains = 1
## Sample size per chain = 10000
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## mu 6.92979 0.21334 0.0021334 0.0021334
## dptMiss 0.65122 0.07321 0.0007321 0.0007321
## polMiss 0.08319 0.08035 0.0008035 0.0008035
## mmrMiss 0.71320 0.04949 0.0004949 0.0004825
## varMiss -1.24892 0.04463 0.0004463 0.0004463
## sig2 31.12577 1.69441 0.0169441 0.0173560
## g 0.88361 1.37190 0.0137190 0.0137190
##
## 2. Quantiles for each variable:
##
## 2.5% 25% 50% 75% 97.5%
## mu 6.51638 6.78795 6.93113 7.0726 7.3419
## dptMiss 0.50857 0.60160 0.65080 0.7002 0.7941
## polMiss -0.07649 0.02818 0.08234 0.1370 0.2420
## mmrMiss 0.61480 0.68008 0.71371 0.7462 0.8095
## varMiss -1.33578 -1.27952 -1.24881 -1.2180 -1.1639
## sig2 27.94425 29.95871 31.06933 32.2215 34.5952
## g 0.20309 0.39927 0.59911 0.9847 3.1671
rsqList <- 1 - (MCMCout_conditional[,"sig2"] / var(reportSampleDF$conditional))
MeanrsqList <- mean(rsqList)
MeanrsqList
## [1] 0.7145511
quantile(rsqList,c(0.025))
## 2.5%
## 0.6827336
quantile(rsqList,c(0.975))
## 97.5%
## 0.7437283
# Bayes factor analysis
conditionalMCMCoutBF <- lmBF(conditional ~ dptMiss + polMiss + mmrMiss + varMiss ,data=reportSampleDF)
summary(conditionalMCMCoutBF)
## Bayes factor analysis
## --------------
## [1] dptMiss + polMiss + mmrMiss + varMiss : 44389270741483073102670811534531134153720679431604137680479811655382002050441444874667850084091485973521762455768514182868334091493893173260895732465272807068522715246876676770274213888 ±0%
##
## Against denominator:
## Intercept only
## ---
## Bayes factor type: BFlinearModel, JZS
# histogram of 95% HDI mean differences on dptMiss | overlaps with 0
hist(MCMCout_conditional[,"dptMiss"],col="#CBB43D")
abline(v=quantile(MCMCout_conditional[,"dptMiss"],c(0.025)),col="blue")
abline(v=quantile(MCMCout_conditional[,"dptMiss"],c(0.975)),col="green")
# histogram of 95% HDI mean differences on polMiss | overlaps with 0
hist(MCMCout_conditional[,"polMiss"],col="#4DAFD4")
abline(v=quantile(MCMCout_conditional[,"polMiss"],c(0.025)),col="blue")
abline(v=quantile(MCMCout_conditional[,"polMiss"],c(0.975)),col="green")
# histogram of 95% HDI mean differences on mmrMiss | overlaps with 0
hist(MCMCout_conditional[,"mmrMiss"],col="pink")
abline(v=quantile(MCMCout_conditional[,"mmrMiss"],c(0.025)),col="blue")
abline(v=quantile(MCMCout_conditional[,"mmrMiss"],c(0.975)),col="green")
# histogram of 95% HDI mean differences on varMiss | overlaps with 0
hist(MCMCout_conditional[,"varMiss"],col="orange")
abline(v=quantile(MCMCout_conditional[,"varMiss"],c(0.025)),col="blue")
abline(v=quantile(MCMCout_conditional[,"varMiss"],c(0.975)),col="green")
options(scipen=999) # turn-off scientific notation like 1e+48
# summary(reportSampleDF)
# str(reportSampleDF)
# Crete lm model
medical.lm <- lm(medical ~ dptMiss + polMiss + mmrMiss + varMiss ,data=reportSampleDF)
summary(medical.lm)
##
## Call:
## lm(formula = medical ~ dptMiss + polMiss + mmrMiss + varMiss,
## data = reportSampleDF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0651 -0.1806 -0.1368 -0.1202 13.9291
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.124270 0.043760 2.840 0.00465 **
## dptMiss 0.006927 0.011813 0.586 0.55781
## polMiss -0.014374 0.012886 -1.115 0.26504
## mmrMiss 0.001800 0.007938 0.227 0.82069
## varMiss 0.017546 0.007232 2.426 0.01552 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8974 on 693 degrees of freedom
## Multiple R-squared: 0.01608, Adjusted R-squared: 0.0104
## F-statistic: 2.831 on 4 and 693 DF, p-value: 0.02392
EnsurePackage("VIF")
EnsurePackage("car")
vif(medical.lm)
## dptMiss polMiss mmrMiss varMiss
## 14.262805 16.377165 7.110362 3.513100
EnsurePackage("MASS") # AIC
stepOut_medical <- stepAIC(medical.lm)
## Start: AIC=-146.17
## medical ~ dptMiss + polMiss + mmrMiss + varMiss
##
## Df Sum of Sq RSS AIC
## - mmrMiss 1 0.0414 558.11 -148.12
## - dptMiss 1 0.2769 558.34 -147.83
## - polMiss 1 1.0020 559.07 -146.92
## <none> 558.07 -146.17
## - varMiss 1 4.7397 562.81 -142.27
##
## Step: AIC=-148.12
## medical ~ dptMiss + polMiss + varMiss
##
## Df Sum of Sq RSS AIC
## - dptMiss 1 0.3362 558.44 -149.70
## - polMiss 1 0.9906 559.10 -148.88
## <none> 558.11 -148.12
## - varMiss 1 4.9939 563.10 -143.90
##
## Step: AIC=-149.7
## medical ~ polMiss + varMiss
##
## Df Sum of Sq RSS AIC
## - polMiss 1 1.0145 559.46 -150.43
## <none> 558.44 -149.70
## - varMiss 1 5.5927 564.04 -144.75
##
## Step: AIC=-150.43
## medical ~ varMiss
##
## Df Sum of Sq RSS AIC
## <none> 559.46 -150.43
## - varMiss 1 7.7258 567.18 -142.86
stepOut_medical$anova
EnsurePackage("MCMCpack")
#histogram on residuals medical.lm
hist(residuals(medical.lm))
#Chi-Square analysis on logistic regression
anova(medical.lm, test="Chisq")
# Convert the log odds for the coefficient on the predictor into regular odds
exp(coef(medical.lm))
## (Intercept) dptMiss polMiss mmrMiss varMiss
## 1.1323218 1.0069507 0.9857287 1.0018016 1.0177005
#confusion matrix
# table(round(predict(medical.lm,type="response")),reportSampleDF$medical)
options(scipen=999) # turn-off scientific notation like 1e+48
EnsurePackage("BayesFactor")
MCMCout_medical <- lmBF(medical ~ dptMiss + polMiss + mmrMiss + varMiss ,data=reportSampleDF, posterior=TRUE, iterations=10000)
summary(MCMCout_medical)
##
## Iterations = 1:10000
## Thinning interval = 1
## Number of chains = 1
## Sample size per chain = 10000
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## mu 0.176513 0.034189 0.00034189 0.00036380
## dptMiss 0.006635 0.011378 0.00011378 0.00011378
## polMiss -0.013715 0.012598 0.00012598 0.00012598
## mmrMiss 0.001726 0.007681 0.00007681 0.00007681
## varMiss 0.016715 0.007136 0.00007136 0.00007104
## sig2 0.803768 0.043340 0.00043340 0.00043612
## g 0.049418 0.069260 0.00069260 0.00069260
##
## 2. Quantiles for each variable:
##
## 2.5% 25% 50% 75% 97.5%
## mu 0.109806 0.153174 0.176351 0.199960 0.24327
## dptMiss -0.015334 -0.001110 0.006665 0.014246 0.02944
## polMiss -0.038650 -0.022041 -0.013680 -0.005383 0.01132
## mmrMiss -0.013355 -0.003397 0.001663 0.006854 0.01689
## varMiss 0.002626 0.011880 0.016714 0.021498 0.03072
## sig2 0.723826 0.773756 0.802510 0.832234 0.89318
## g 0.011025 0.021784 0.033495 0.055239 0.17953
rsqList <- 1 - (MCMCout_medical[,"sig2"] / var(reportSampleDF$medical))
MeanrsqList <- mean(rsqList)
MeanrsqList
## [1] 0.01226551
quantile(rsqList,c(0.025))
## 2.5%
## -0.09760797
quantile(rsqList,c(0.975))
## 97.5%
## 0.1105045
# Bayes Factor Analysis
medicalBF <- lmBF(medical ~ dptMiss + polMiss + mmrMiss + varMiss ,data=reportSampleDF)
summary(medicalBF)
## Bayes factor analysis
## --------------
## [1] dptMiss + polMiss + mmrMiss + varMiss : 0.07607594 ±0.01%
##
## Against denominator:
## Intercept only
## ---
## Bayes factor type: BFlinearModel, JZS
# histogram of 95% HDI mean differences on dptMiss | overlaps with 0
hist(MCMCout_medical[,"dptMiss"],col="#CBB43D")
abline(v=quantile(MCMCout_medical[,"dptMiss"],c(0.025)),col="blue")
abline(v=quantile(MCMCout_medical[,"dptMiss"],c(0.975)),col="green")
# histogram of 95% HDI mean differences on polMiss | overlaps with 0
hist(MCMCout_medical[,"polMiss"],col="#4DAFD4")
abline(v=quantile(MCMCout_medical[,"polMiss"],c(0.025)),col="blue")
abline(v=quantile(MCMCout_medical[,"polMiss"],c(0.975)),col="green")
# histogram of 95% HDI mean differences on mmrMiss | overlaps with 0
hist(MCMCout_medical[,"mmrMiss"],col="pink")
abline(v=quantile(MCMCout_medical[,"mmrMiss"],c(0.025)),col="blue")
abline(v=quantile(MCMCout_medical[,"mmrMiss"],c(0.975)),col="green")
# histogram of 95% HDI mean differences on varMiss | overlaps with 0
hist(MCMCout_medical[,"varMiss"],col="orange")
abline(v=quantile(MCMCout_medical[,"varMiss"],c(0.025)),col="blue")
abline(v=quantile(MCMCout_medical[,"varMiss"],c(0.975)),col="green")
Step wise analysis results, Initial Model: religious ~ dptMiss + polMiss + mmrMiss + varMiss
Final Model: religious ~ dptMiss + polMiss + varMiss Step Df Deviance Resid. Df Resid. Dev AIC 1 693 50352.60 2996.453 2 - mmrMiss 1 28.25718 694 50380.86 2994.845
#multiple linear regression
options(scipen=999) # turn-off scientific notation like 1e+48
# summary(reportSampleDF)
# str(reportSampleDF)
# Crete lm model
religious.lm <- lm(religious ~ dptMiss + polMiss + mmrMiss + varMiss ,data=reportSampleDF)
summary(religious.lm)
##
## Call:
## lm(formula = religious ~ dptMiss + polMiss + mmrMiss + varMiss,
## data = reportSampleDF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.639 -0.762 0.282 0.698 104.436
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.24661 0.41567 -0.593 0.5532
## dptMiss -0.24685 0.11221 -2.200 0.0281 *
## polMiss 0.22793 0.12240 1.862 0.0630 .
## mmrMiss -0.04702 0.07541 -0.624 0.5331
## varMiss 0.99891 0.06870 14.541 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.524 on 693 degrees of freedom
## Multiple R-squared: 0.483, Adjusted R-squared: 0.4801
## F-statistic: 161.9 on 4 and 693 DF, p-value: < 0.00000000000000022
EnsurePackage("VIF")
EnsurePackage("car")
vif(religious.lm)
## dptMiss polMiss mmrMiss varMiss
## 14.262805 16.377165 7.110362 3.513100
EnsurePackage("MASS") # AIC
stepOut_religious <- stepAIC(religious.lm)
## Start: AIC=2996.45
## religious ~ dptMiss + polMiss + mmrMiss + varMiss
##
## Df Sum of Sq RSS AIC
## - mmrMiss 1 28.3 50381 2994.8
## <none> 50353 2996.5
## - polMiss 1 252.0 50605 2997.9
## - dptMiss 1 351.7 50704 2999.3
## - varMiss 1 15362.7 65715 3180.3
##
## Step: AIC=2994.84
## religious ~ dptMiss + polMiss + varMiss
##
## Df Sum of Sq RSS AIC
## <none> 50381 2994.8
## - polMiss 1 223.9 50605 2995.9
## - dptMiss 1 410.5 50791 2998.5
## - varMiss 1 15526.3 65907 3180.4
stepOut_religious$anova
EnsurePackage("MCMCpack")
#histogram on residuals religious.lm
hist(residuals(religious.lm))
#Chi-Square analysis on logistic regression
anova(religious.lm, test="Chisq")
# Convert the log odds for the coefficient on the predictor into regular odds
exp(coef(religious.lm))
## (Intercept) dptMiss polMiss mmrMiss varMiss
## 0.7814467 0.7812570 1.2560020 0.9540644 2.7153276
#confusion matrix
# table(round(predict(religious.lm,type="response")),reportSampleDF$religious)
options(scipen=999) # turn-off scientific notation like 1e+48
EnsurePackage("BayesFactor")
MCMCout_religious <- lmBF(religious ~ dptMiss + polMiss + mmrMiss + varMiss ,data=reportSampleDF, posterior=TRUE, iterations=10000)
summary(MCMCout_religious)
##
## Iterations = 1:10000
## Thinning interval = 1
## Number of chains = 1
## Sample size per chain = 10000
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## mu 4.44532 0.32036 0.0032036 0.0032036
## dptMiss -0.24300 0.11225 0.0011225 0.0011225
## polMiss 0.22447 0.12251 0.0012251 0.0012251
## mmrMiss -0.04661 0.07621 0.0007621 0.0007621
## varMiss 0.99243 0.06819 0.0006819 0.0006819
## sig2 72.95072 3.87184 0.0387184 0.0393957
## g 0.34720 0.40820 0.0040820 0.0040820
##
## 2. Quantiles for each variable:
##
## 2.5% 25% 50% 75% 97.5%
## mu 3.80448 4.23195 4.44384 4.661998 5.07339
## dptMiss -0.46085 -0.31910 -0.24355 -0.167167 -0.02233
## polMiss -0.01387 0.14281 0.22462 0.306061 0.46359
## mmrMiss -0.19657 -0.09847 -0.04732 0.003765 0.10419
## varMiss 0.85783 0.94773 0.99312 1.038333 1.12667
## sig2 65.73168 70.29508 72.82818 75.473907 81.01892
## g 0.07924 0.15516 0.23848 0.389960 1.29311
rsqList <- 1 - (MCMCout_religious[,"sig2"] / var(reportSampleDF$religious))
MeanrsqList <- mean(rsqList)
MeanrsqList
## [1] 0.4779657
quantile(rsqList,c(0.025))
## 2.5%
## 0.4202298
quantile(rsqList,c(0.975))
## 97.5%
## 0.5296251
religiousBF <- lmBF(religious ~ dptMiss + polMiss + mmrMiss + varMiss ,data=reportSampleDF)
summary(religiousBF)
## Bayes factor analysis
## --------------
## [1] dptMiss + polMiss + mmrMiss + varMiss : 13924325715845296893236637991759097040796547804482437591555400957505265181385934113503608569856 ±0%
##
## Against denominator:
## Intercept only
## ---
## Bayes factor type: BFlinearModel, JZS
# histogram of 95% HDI mean differences on dptMiss | overlaps with 0
hist(MCMCout_religious[,"dptMiss"],col="#CBB43D")
abline(v=quantile(MCMCout_religious[,"dptMiss"],c(0.025)),col="blue")
abline(v=quantile(MCMCout_religious[,"dptMiss"],c(0.975)),col="green")
# histogram of 95% HDI mean differences on polMiss | overlaps with 0
hist(MCMCout_religious[,"polMiss"],col="#4DAFD4")
abline(v=quantile(MCMCout_religious[,"polMiss"],c(0.025)),col="blue")
abline(v=quantile(MCMCout_religious[,"polMiss"],c(0.975)),col="green")
# histogram of 95% HDI mean differences on mmrMiss | overlaps with 0
hist(MCMCout_religious[,"mmrMiss"],col="pink")
abline(v=quantile(MCMCout_religious[,"mmrMiss"],c(0.025)),col="blue")
abline(v=quantile(MCMCout_religious[,"mmrMiss"],c(0.975)),col="green")
# histogram of 95% HDI mean differences on varMiss | doesnt overlap with 0
hist(MCMCout_religious[,"varMiss"],col="orange")
abline(v=quantile(MCMCout_religious[,"varMiss"],c(0.025)),col="blue")
abline(v=quantile(MCMCout_religious[,"varMiss"],c(0.975)),col="green")
summary(reportSampleDF)
## code name pubpriv enrollment
## Min. : 100362 Length:698 Length:698 Min. : 10.00
## 1st Qu.:6015501 Class :character Class :character 1st Qu.: 42.25
## Median :6043137 Mode :character Mode :character Median : 75.00
## Mean :5588410 Mean : 76.11
## 3rd Qu.:6116305 3rd Qu.:105.00
## Max. :7103161 Max. :221.00
## allvaccs conditional medical religious
## Min. : 8.00 Min. : 0.000 Min. : 0.0000 Min. : 0.000
## 1st Qu.: 86.17 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 0.000
## Median : 93.33 Median : 3.030 Median : 0.0000 Median : 1.087
## Mean : 89.01 Mean : 6.931 Mean : 0.1759 Mean : 4.449
## 3rd Qu.: 97.46 3rd Qu.: 8.677 3rd Qu.: 0.0000 3rd Qu.: 4.353
## Max. :100.00 Max. :81.818 Max. :14.2857 Max. :162.500
## dptMiss polMiss mmrMiss hepMiss
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 1.699 1st Qu.: 1.504 1st Qu.: 1.429 1st Qu.: 0.722
## Median : 4.918 Median : 4.587 Median : 5.042 Median : 3.073
## Mean : 8.638 Mean : 8.229 Mean : 8.793 Mean : 5.909
## 3rd Qu.:10.960 3rd Qu.:10.507 3rd Qu.:11.111 3rd Qu.: 7.767
## Max. :81.250 Max. :81.250 Max. :81.250 Max. :79.070
## varMiss isPublicPrivate val
## Min. : 0.000 Min. :0.0000 Min. :1
## 1st Qu.: 0.000 1st Qu.:1.0000 1st Qu.:1
## Median : 2.485 Median :1.0000 Median :1
## Mean : 5.371 Mean :0.7822 Mean :1
## 3rd Qu.: 6.667 3rd Qu.:1.0000 3rd Qu.:1
## Max. :79.070 Max. :1.0000 Max. :1